
Figure 2: An example of a directory /joe and file /joe/foo. 
Each replica of joe stores three pointers to the gold replicas of 
foo. Each replica of foo keeps a backpointer to the parent 
directory. Bronze replicas are connected randomly to form strongly 
connected graphs. Bronze replicas also have uni-directional links 
to the gold replicas of the file, which are not shown. 
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Figure 3 



proc UpdateReplica 

$r_2$: Replica // New replica contents. 
preconditions: 

$\forall\, (pfid, fname) : r_2.bptrs \bullet pfid \in \mathrm{dom}(DISK)$ 
and $IsLive(r_2) \Rightarrow IsLive(DISK(pfid))$ 
postconditions: 

r 2 .bptr^{} < 3 > 



♦ "$(val_1, \ldots, val_n)$" represents a tuple of values. 

♦ "$\mathbb{P}\ type$" represents a (possibly empty) set of type. 
♦ "$\mathbb{P}_1\ type$" represents a nonempty set of type. 
♦ "$Key \mapsto Val$" represents a one-to-many mapping from 
type Key to Val. 

♦ "$\mathrm{dom}(F)$" returns the domain of function (or mapping) F, 
and "$\mathrm{ran}(F)$" returns the range of F. For instance, 

$\mathrm{dom}(\{1 \mapsto 3, 2 \mapsto 8, 4 \mapsto 3\}) = \{1, 2, 4\}$, 
$\mathrm{ran}(\{1 \mapsto 3, 2 \mapsto 8, 4 \mapsto 3\}) = \{3, 8\}$. 

♦ "$X \oplus Y$" substitutes a part of mapping X by Y. E.g., 

$\{1 \mapsto 3, 2 \mapsto 1\} \oplus \{1 \mapsto 5, 3 \mapsto 4\}$ 
$= \{1 \mapsto 5, 2 \mapsto 1, 3 \mapsto 4\}$. 

♦ "$X \ndres Y$" means function-domain restriction. E.g., 

$\{2\} \ndres \{1 \mapsto 3, 2 \mapsto 8, 4 \mapsto 6\} = \{1 \mapsto 3, 4 \mapsto 6\}$. 

♦ "$\forall\, var : set \bullet expr$" means that expr holds for var in set. 
E.g., 

$\forall\, n : \{11, 13, 17\} \bullet IsPrime(n)$. 

♦ "$\Diamond\, expr$" means that expr holds eventually. 

♦ "$\{var : set \bullet expr\}$" means set comprehension. E.g., 

$\{x : \{1, 2, 3\} \bullet x^2\} = \{1, 4, 9\}$. 



type Replica = record 
fid$^{(1)}$ : FileID 
peers$^{(2)}$ : $\mathbb{P}$ NodeID 

gpeers$^{(3)}$ : $\mathbb{P}_1$ NodeID 

bptrs$^{(4)}$ : $\mathbb{P}$ Backptr 

deadBptr$^{(5)}$ : Backptr 

ts$^{(6)}$ : Timestamp 
type RegularReplica inherits Replica = 

contents$^{(7)}$ : Data 

Invariants: 

$\neg IsLive(r) \Rightarrow contents = \{\}$ 
type DirReplica inherits Replica = 

ents$^{(8)}$ : (FileID, String) $\mapsto$ DEntry 

Invariants: 

$\neg IsLive(r) \Rightarrow ents = \{\}$ 
type Backptr = (FileID, String) 
type Dentry = record 

valid$^{(9)}$ : bool 

ts$^{(10)}$ : Timestamp 

gpeers$^{(11)}$ : $\mathbb{P}_1$ NodeID 
proc IsLive(r) 

return r is the root or $r.bptrs \neq \{\}$ 
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/joe /bob 
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DISK: FilelD ~ Replica 
CLOG: FileID $\mapsto$ $\mathbb{P}_1$ NodeID 
ULOG: FileID $\mapsto$ $\mathbb{P}$ Backptr 
Invariants: 

// Updates are only for existing replicas. 
$\mathrm{dom}(CLOG) \cup \mathrm{dom}(ULOG) \subseteq \mathrm{dom}(DISK)$ $^{(12)}$ 



proc Create 

d: DirReplica // The local replica of the parent directory. 
fname: string // The name of the new file in d 
gpeers: $\mathbb{P}_1$ NodeID // The placement of the replicas of the file. 
preconditions: 
$IsLive(d)$ $^{(13)}$ 



$r \leftarrow Newreplica()$ 
$r.fid \leftarrow Newfileid()$ 
$r.gpeers \leftarrow gpeers$ 
$r.ts \leftarrow Newtimestamp()$ 
$r.peers \leftarrow \{\}$ 

$r.bptrs \leftarrow \{(d.fid, fname)\}$ 
$r.contents \leftarrow None$ 
UpdateReplica(r) 
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proc Unlink 

f: Replica // The file to be unlinked. 
d: DirReplica // The directory the file belongs to. 
fname: string // f's name in d. 
preconditions: 
$IsLive(d)$ 

f is a directory $\Rightarrow f.ents = \{\}$ 
$(d.fid, fname) \in f.bptrs$ 



$f' \leftarrow Deepcopy(f)$ 

$f'.bptrs \leftarrow f.bptrs \setminus \{(d.fid, fname)\}$ 
if $f'.bptrs = \{\}$ then 

$f'.deadBptr \leftarrow (d.fid, fname)$ 
$f'.ts \leftarrow Newtimestamp()$ 
$UpdateReplica(f')$ 



proc Hardlink 

f: RegularReplica // The replica of the file. 

d: DirReplica // The directory to which f will be linked to. 

fname: string // The filename within d. 
preconditions: 

IsLive(d) 



$f' \leftarrow Deepcopy(f)$ 

$f'.bptrs \leftarrow f.bptrs \cup \{(d.fid, fname)\}$ 
$f'.ts \leftarrow Newtimestamp()$ 
$UpdateReplica(f')$ 



proc Rename 

f: Replica // The file to be moved. 

$d_F$: DirReplica // The origin dir. 

$d_T$: DirReplica // The destination dir. 

$fname_F$: string // The filename in $d_F$ 

$fname_T$: string // The filename in $d_T$ 
preconditions: 

$IsLive(d_F)$ and $IsLive(d_T)$ 

$(d_F.fid, fname_F) \in f.bptrs$ 



$f' \leftarrow Deepcopy(f)$ 

$f'.bptrs \leftarrow f.bptrs \setminus \{(d_F.fid, fname_F)\} \cup \{(d_T.fid, fname_T)\}$ 

$f'.ts \leftarrow Newtimestamp()$ 

$UpdateReplica(f')$ 
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proc Write 

f: RegularReplica 

newcontents: Data 



$f' \leftarrow Deepcopy(f)$ 

$f'.contents \leftarrow newcontents$ 

$f'.ts \leftarrow Newtimestamp()$ 

$UpdateReplica(f')$ 



proc UpdateReplica 

$r_2$: Replica // New replica contents. 
preconditions: 

// All parent directories are stored locally. 
// Moreover, if $r_2$ is live, then parent must also be live. 
$\forall\, (pfid, fname) : r_2.bptrs \bullet pfid \in \mathrm{dom}(DISK)$ 
and $IsLive(r_2) \Rightarrow IsLive(DISK(pfid))$ $^{(14)}$ 



if $r_2.fid \notin \mathrm{dom}(DISK)$ then 

// The replica isn't locally stored yet. 
$DISK \leftarrow DISK \cup \{r_2.fid \mapsto r_2\}$ 
$IssueCupdate(r_2)$ 
return 

$r_1 \leftarrow DISK(r_2.fid)$ 

if File is regular then 

Do some application-specific stuff. 

We can potentially use version vectors here. 
else 

// Union dir entries, taking ones with newer timestamps on conflict. 
for $(key \mapsto e) \in r_2.ents$ 
if $key \notin \mathrm{dom}(r_1.ents)$ or $r_1.ents(key).ts < e.ts$ 

$r_1.ents \leftarrow r_1.ents \oplus \{key \mapsto e\}$ 
for each added or deleted entry $(fid, fname)$ in $r_1.ents$ 

// Entry $(fid, fname)$ is potentially inconsistent. Fix up later. 
if $fid \in \mathrm{dom}(DISK)$ then 

$IssueUupdate(DISK(fid), \{\})$ $^{(15)}$ 

if $r_2.ts > r_1.ts$ then $^{(16)}$ 

// The file's attributes are to be updated. 
$r_1.ts \leftarrow r_2.ts$ 

if $r_1.gpeers \neq r_2.gpeers$ then 
$r_1.gpeers \leftarrow r_2.gpeers$ 

// When the replica's gold-peer set changes, I must reflect the 
// change to the parent dir entry. 
$IssueUupdate(r_1, \{\})$ 

// Resolve potential conflicts on back pointers. 

if $r_1.bptrs \neq r_2.bptrs$ or $r_1.deadBptr \neq r_2.deadBptr$ then 

$IssueUupdate(r_1, r_1.bptrs \setminus r_2.bptrs)$ $^{(17)}$ 

$r_1.bptrs \leftarrow r_2.bptrs$ 

$r_1.deadBptr \leftarrow r_2.deadBptr$ 

// If the last link to the replica is gone, erase the contents. 
if $\neg IsLive(r_1)$ then 

if $r_1$ is a regular file then 
$r_1.contents \leftarrow None$ 

else 

for $e \in r_1.ents \bullet e.valid$ 

$IssueUupdate(DISK(e.fid), \{\})$ $^{(18)}$ 
$r_1.ents \leftarrow \{\}$ 

if any of $r_1$'s attributes has changed then 
$IssueCupdate(r_1)$ $^{(19)}$ 



Protocol for adding a replica. 

Constants : 

M: Number of neighbors per replica. 

MAXHOPS: The number of hops per random walk (the 
usual value is 3). 

# 

# AddReplica is the main procedure that adds 

# a replica of file F on the executing node. 
# 

AddReplica (F, G) 

G: the set of gold replicas of F. 

(G is obtained by looking up the parent directory) 

g = Pick a random live node in G. 
Send to g, "CreateReplica (F, myself)" 
Wait for the contents to arrive. 
Store contents and reply to the client. 

r = find the replica of F. 
Send to g, "StartRandomWalk ( F, myself)" 
Wait for the set of neighbors N to arrive, 
for n in N: 

Add edge to n in r. 

Send to n, "AddEdge(F, myself)" 

CreateReplica ( F, Sender) : 
F: the ID of the file 

Sender: the node requesting replica creation. 
r = find the replica of F 

n = pick the replica closest to Sender among 
graph neighbors of r. 

Send to n, "SendReplicaContents ( F, Sender)" 

SendReplicaContents ( F, Sender) : 
F: the ID of the file 

Sender: the node requesting replica creation. 

r = find the replica of F 
Send r to Sender . 

StartRandomWalk (F, Sender) : 

F: the ID of the file 
Sender: the node requesting replica creation. 



r = find the replica of F 
N = {} 

for i = 0 to M-2: 

n = pick random graph neighbor in r. 

Send to n, "DoRandomWalk (F, 0, myself)" 

Receive nodeid from n. 

Add nodeid to N. 
Send N to Sender. 

DoRandomWalk (F, hops, prevHopNode) : 
F: the ID of the file 

hops: the number of hops made so far. 

if hops == MAXHOPS 

Send myself to prevHopNode 

else 

r = find the replica of F. 

n = pick random graph neighbor in r 

Send to n, "DoRandomWalk ( F, hops + 1, myself)" 

Receive nodeid from n. 

Send nodeid to prevHopNode 

AddEdge(F, peer): 

F: the ID of the file 
peer: the node to span edge to 
r = find the replica of F 
Add edge to peer in r 



proc IssueCupdate 

r: Replica // The replica of the file being updated. 



$CLOG(r.fid) \leftarrow r.gpeers \cup r.peers$ 



proc PropagateCupdate // Runs periodically in the background 



for $(fid \mapsto targets) \in CLOG$ 
$r \leftarrow DISK(fid)$ // See (12). 

// Send the location of the parent dirs so that the target can 
// replicate them to ensure name-space containment. 
$pDirs \leftarrow \{p : r.bptrs \bullet (p.fid, DISK(p.fid).gpeers)\}$ // See (14). 
for $n \in targets$ 

send (CUPDATE, r, pDirs) to n. 
when receive (CUPDATE-REPLY, ts) from node n 
if $CLOG(fid).ts = ts$ 

$CLOG(fid) \leftarrow CLOG(fid) \setminus \{n\}$ 

Remove fid from CLOG when CLOG(fid) becomes empty. 



when receive (CUPDATE, r, pDirs) 
r: Replica // New replica contents. 

pDirs: $\mathbb{P}$ (FileID, $\mathbb{P}$ NodeID) // Name and location of parent dirs. 



for $(pfid, ppeers) \in pDirs$ 
CreateReplica(pfid, ppeers) 
ResurrectDirectory(pfid) 

UpdateReplica(r) 

send (CUPDATE-REPLY, r.ts) 
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proc IssueUupdate 

r: Replica // The replica of the file 

del: $\mathbb{P}$ (FileID, String) // Backpointers deleted from the replica. 



if $r.fid \in \mathrm{dom}(ULOG)$ then 
$del \leftarrow del \cup ULOG(r.fid)$ 
$ULOG \leftarrow ULOG \oplus \{r.fid \mapsto del\}$ 

proc ProcessUupdate // Called periodically in the background. 



for $(fid \mapsto del) \in ULOG$ 
$r \leftarrow DISK(fid)$ // See (12). 
for $(pfid, fname) \in del \cup r.bptrs$ 

ResurrectDirectory(pfid) 

$d \leftarrow DISK(pfid)$ 

$valid = pfid \in r.bptrs$ // Is this entry to be added? 
$new = \{(fid, fname) \mapsto Dentry(valid, r.ts, r.gpeers)\}$ 
$d.ents \leftarrow (\{(fid, fname)\} \ndres d.ents) \cup new$ 
if d.ents has changed 
$d.ts \leftarrow Newtimestamp()$ 
IssueCupdate(d) 
$ULOG \leftarrow \{\}$ 



proc CreateReplica 

fid: FilelD // The ID of the file 

peers: $\mathbb{P}_1$ NodeID // The known set of gold peers of the file 
postconditions: 

$fid \in \mathrm{dom}(DISK)$ 



if $fid \in \mathrm{dom}(DISK)$ then 
return 

send (SEND-CONTENTS, fid) to random node $n \in peers$ 
Wait until receive (CONTENTS, r, pDirs) from n 
for $(pfid, ppeers) \in pDirs$ 

CreateReplica(pfid, ppeers) 
UpdateReplica(r) 

Add edges between r and random existing replicas. 
when receive (SEND-CONTENTS, fid) from node n 



$r \leftarrow DISK(fid)$ 

$pDirs \leftarrow \{p : r.bptrs \bullet (p.fid, DISK(p.fid).gpeers)\}$ // See (14). 
send (CONTENTS, r, pDirs) to n. 



proc ResurrectDirectory 

fid: FilelD 
preconditions: 

$fid \in \mathrm{dom}(DISK)$ 
fid is a directory 
postconditions: 
$IsLive(DISK(fid))$ 



$r \leftarrow DISK(fid)$ 
if $IsLive(r)$ then 
return 

ResurrectDirectory(r.deadBptr.pfid) 
$r.bptrs \leftarrow \{r.deadBptr\}$ 
$r.ts \leftarrow Newtimestamp()$ 
IssueCupdate(r) 
let $(pfid, fname) = r.deadBptr \bullet$ 
$d \leftarrow DISK(pfid)$ 

$d.ents((fid, fname)) \leftarrow Dentry(true, r.ts, r.gpeers)$ $^{(20)}$ 

$d.ts \leftarrow Newtimestamp()$ 

IssueCupdate(d) 
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// Called every third night for every replica on the node, 
proc GarbageCollection 

LiveNodes: $\mathbb{P}_1$ NodeID // Set of live nodes. 

r: Replica // Replica to be inspected. 

EXPIRE: integer // Dead-replica expiration period, e.g., a month. 



// Remove old tombstones. Removing after EXPIRE seconds is safe 
// because we cannot receive any new update with timestamp older 
// than EXPIRE after removing r. 

if $\neg IsLive(r)$ and $r.ts < Newtimestamp() - EXPIRE$ then 
$DISK \leftarrow \{r.fid\} \ndres DISK$ 
return 

$r' \leftarrow Deepcopy(r)$ 

// Remove dead entries in the directory. 
if $r'$ is a directory then 
for $(key \mapsto val) : r'.ents \bullet$ 

if $\neg val.valid$ and $val.ts < Newtimestamp() - EXPIRE$ then 
$r'.ents \leftarrow \{key\} \ndres r'.ents$ 
if at least one entry has been removed from $r'.ents$ then 
$r'.ts \leftarrow Newtimestamp()$ 
$UpdateReplica(r')$ 

// If some gold peers are found dead, recreate one elsewhere. 
if $me \in r.gpeers$ and $r.gpeers \nsubseteq LiveNodes$ then 

$newNodes \leftarrow$ Pick $|r.gpeers \setminus LiveNodes|$ random live nodes. 

$r'.gpeers \leftarrow r.gpeers \setminus LiveNodes \cup newNodes$ 

$r'.ts \leftarrow Newtimestamp()$ 

$UpdateReplica(r')$ 

// If we find graph edges to dead nodes, re-span it. 
for $n \in r.peers \setminus LiveNodes$ 

Add edges between r and a random replica. 

$r.peers \leftarrow r.peers \cap LiveNodes$ 



$\forall\, f : \mathrm{ran}(DISK)$ and $IsLive(f) \bullet$ 
( $(d.fid, fname) \in f.bptrs$ 
$\Rightarrow$ 
$d \in \mathrm{dom}(DISK)$ and $IsLive(d)$ ) 



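$\forall\, d : \mathrm{ran}(DISK)$ and $IsLive(d) \bullet$ 
( $\forall\, f : \mathrm{ran}(DISK)$ and $IsLive(f) \bullet$ 

$((f.fid, fname) \mapsto ent) \in d.ents$ and $ent.valid$ 
and $(d.fid, fname) \notin f.bptrs$ 

$\Rightarrow \Diamond\, ((f.fid, fname) \mapsto ent) \notin d.ents$ ) 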



V 5< t3 



$\forall\, d : \mathrm{ran}(DISK)$ and $IsLive(d) \bullet$ 
( $((f.fid, fname) \mapsto ent) \in d.ents$ and $ent.valid$ 
$\Rightarrow f : \mathrm{ran}(DISK')$ and $IsLive(f)$ ) 
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